Load raw data, annotate probes using biomaRt and load SFARI genes
# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columns in datExpr don\'t match the rowd in datMeta!')
}
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#################################################################################
# FILTERS:
# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
datExpr_backup = datExpr
rm(getinfo, to_keep, gene_names, mart)
Number of genes:
nrow(datExpr)
## [1] 63677
Gene count by SFARI score:
table(SFARI_genes$`gene-score`)
##
## 1 2 3 4 5 6
## 29 82 209 538 191 25
Gene count by brain lobe:
table(datMeta$Brain_lobe)
##
## Frontal Temporal Parietal Occipital
## 21 20 22 23
Gene count by SFARIscore and brain lobe:
t(table(datMeta$Brain_lobe, datMeta$Diagnosis_))
##
## Frontal Temporal Parietal Occipital
## ASD 8 14 14 15
## CTL 13 6 8 8
Regions: Frontal, Temporal, Parietal and Occipital
y axis cut at 1000 to remove outliers
The distributions by score seem very similar between regions
make_ASD_vs_CTL_df = function(datExpr, lobe){
datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe & rownames(datMeta) %in% colnames(datExpr))
datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta_lobe$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta_lobe$Diagnosis_!='ASD'))
ASD_vs_CTL = data.frame('ID'=as.character(rownames(datExpr)),
'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
left_join(SFARI_genes, by='ID') %>%
dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))
return(ASD_vs_CTL)
}
p = list()
for(lobe in names(table(datMeta$Brain_lobe))){
datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_lobe, lobe)
plot = ggplotly(ggplot(ASD_vs_CTL, aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + theme_minimal() + ylim(0, 1000) +
scale_fill_manual(values=gg_colour_hue(7)) +
theme(legend.position = 'none'))
p[lobe] = list(plot)
}
subplot(p[[1]], p[[2]], p[[3]], p[[4]], nrows=2)
rm(p, lobe, datExpr_lobe, plot)
datExpr = datExpr_backup # Just in case
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_+Region)
#Estimate size factors
dds = estimateSizeFactors(dds)
vst_output = vst(dds)
datExpr = assay(vst_output)
# Filter out genes with 0 variance (filter 19659)
to_keep = apply(datExpr, 1, sd)>0.1
datExpr = datExpr[to_keep,]
# Filter out genes with mean < 3 (filter 13547)
to_keep = rowMeans(datExpr)>3.3
datExpr = datExpr[to_keep,] %>% data.frame
datExpr_post_Norm = datExpr
rm(counts, rowRanges, se, dds, vst_output, to_keep)
Regions: Frontal, Temporal, Parietal and Occipital
Similar behaviour in all regions
datExpr = datExpr_post_Norm # Should be equal, jut in case
p = list()
for(lobe in names(table(datMeta$Brain_lobe))){
datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_lobe, lobe)
plot = ggplotly(ggplot(ASD_vs_CTL, aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(7)) +
theme(legend.position = 'none'))
p[lobe] = list(plot)
}
subplot(p[[1]], p[[2]], p[[3]], p[[4]], nrows=2)
rm(p, datExpr_lobe, ASD_vs_CTL, plot)
Seems like the Parietal lobe is the only one with a significant number of DE genes
DE_by_region = function(datExpr, datMeta){
mod = model.matrix(~ Diagnosis_, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),]
}
DE_info_by_region = list()
i=1
for(lobe in names(table(datMeta$Brain_lobe))){
datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe)
DE_info = DE_by_region(datExpr_lobe, datMeta_lobe) %>% mutate('ID'=rownames(datExpr_lobe))
DE_info_by_region[[i]] = DE_info
i = i+1
print(glue(lobe,' lobe: ', sum(DE_info$adj.P.Val<0.05 & DE_info$logFC>log2(1.2)),
' DE genes'))
}
## Frontal lobe: 0 DE genes
## Temporal lobe: 0 DE genes
## Parietal lobe: 268 DE genes
## Occipital lobe: 4 DE genes
names(DE_info_by_region) = names(table(datMeta$Brain_lobe))
rm(i, lobe, datExpr_lobe, datMeta_lobe, DE_info)
PC1 explains the average expression and PC2 log fold change
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.95){
datExpr_pca = prcomp(datExpr, scale=TRUE)
last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>%
filter(.[[2]] >= var_explained) %>% top_n(-1, ID)
print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
var_explained*100, '% of the variance'))
datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
lobe = 'Parietal'
datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe)
red_dim = reduce_dim_datExpr(datExpr_lobe, datMeta_lobe, 0.97)
## Keeping top 10 components that explain 97% of the variance
pca_lobe = red_dim$datExpr %>% mutate('ID' = DE_info_by_region[[lobe]]$ID) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, PC1, PC2, `gene-score`) %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(DE_info_by_region[[lobe]], by='ID') %>% mutate('abs_lFC'=abs(logFC))
selectable_scatter_plot(pca_lobe[,-1], pca_lobe[,-1])